################################
## Name: 09_estimate_precision.R 
## Purpose: This script calculates precision estimates
## and combines them with recall estimates for the final
## table. It also creates the fine tuning datasets with the 
## examples we labelled for fine tuning. 
## Data In: 
## 1) full data
## data/bioweapons_casestudy_5_20_2024_sbert_embeddings.json
## 2) master coding file for STM pairs: 
## data/precision_stm_test_updated.rds
## 3) master coding file for exact text (ngrams) pairs:
## data/precision_ngrams_test_updated_10_9_2024.rds
## 4) master coding file for sbert cutoffs:
## data/precision_sbert_master.rds
## 5) master coding file for sbert cutoffs, examining
## stricter cutoffs:
## data/precision_sbert_master_recall_cutoff.rds
## 6) master coding file for relatio cutoffs
## data/precision_relatio_test.rds
## 7) same claim, same subject annotations
## all files have the form: 
## data/gpt4o_annotations/annotate_lists_[num].rds
## 8) pairs used in same claim fine tuning: 
## data/fine_tune_same_claim_training_data.rds
## 9) pairs used in same subject fine tuning:
## data/fine_tune_same_subject_training_data.rds
## 10) recall estimates for recall training set
## data/recall_master_updated_11_24_2024_with_all_recall_details_public.rds
## 11) recall estimates for recall holdout set
## data/recall_holdout_with_all_recall_estimates_public.rds
## Data Out:
## 1) Fine tuning training data:
## data/fine_tune_examples.rds
## 2) Table 1 in main text and Table A4 in SI
## overall results of estimator performance 

## Notes:


################################
### Files and Dependencies #####

library(openxlsx) ## 4.2.6.1
library(tidyverse) ## 2.0.0
library(xtable) ## 1.8.4

## "not in" operator: TRUE for every element of x that is absent from table
`%ni%` <- function(x, table) !(x %in% table)

## articles
## read with jsonlite::stream_in(); the result is stored in `art` and is
## later split into chunks and used as the article_id lookup table
art <- jsonlite::stream_in(file("data/bioweapons_casestudy_5_20_2024_sbert_embeddings.json"))

### master coding files
### (one per estimator; each is later given a match_id so that its
### cutoff/stratum information can be merged into the hand codes)
## stm
stm <- readRDS("data/precision_stm_test_updated.rds")

## ngram
ngram <- readRDS("data/precision_ngrams_test_updated_10_9_2024.rds")

## sbert
sbert <- readRDS("data/precision_sbert_master.rds")

## sbert, stricter cutoffs
sbert_recall <- readRDS("data/precision_sbert_master_recall_cutoff.rds")

## relatio
relatio <- readRDS("data/precision_relatio_test.rds")

## cleaned coded files
## from precision tests:
## `out` holds the hand codes (same_subject / same_claim) that the rest
## of the script cleans, aggregates and turns into precision estimates
out <- readRDS("data/precision_codes_combined_public.rds")

## same claim, same subject annotations
## (loaded in this script because the fine tuning training
## data is also assembled further down)
annotation_dir <- "data/gpt4o_annotations/"
list_annotations <- list.files(annotation_dir)
list_annotations <- list_annotations[grepl("list", list_annotations)]

## chunk number embedded in each file name (annotate_lists_[num].rds)
list_annotations_num <- as.numeric(gsub("annotate_lists_|\\.rds", "",
                                        list_annotations))
list_annotations <- paste0(annotation_dir, list_annotations)

## pull the message content out of every stored response in a list
extract_message_content <- function(responses) {
  unlist(lapply(responses, function(response) {
    response$choices$message.content
  }))
}

## read in lists:
## split the articles into 100 chunks and attach to each chunk the
## subject list (first element of the file) and the claim list (second
## element) from the annotation file carrying the same chunk number
art <- split(art, 1:100)

for (chunk in 1:100) {
  annotation_file <- list_annotations[which(list_annotations_num == chunk)]
  annotations <- readRDS(annotation_file)

  art[[chunk]]$subject_list <- extract_message_content(annotations[[1]])
  art[[chunk]]$claim_list <- extract_message_content(annotations[[2]])
}

## back to a single data frame
art <- bind_rows(art)


## for the fine tuned model: pairs used in fine tuning
## (both files are later filtered on train_test == "train" and their
## match_id / ego_id / alter_id columns are used to flag leakage)
fine_tune_same_claim_data <- readRDS("data/fine_tune_same_claim_training_data.rds")
fine_tune_same_subject_data <- readRDS("data/fine_tune_same_subject_training_data.rds")



## Recall estimates (for combined table)
## `recall` is the recall training set, `recall_holdout` the holdout set
recall <- readRDS("data/recall_master_updated_11_24_2024_with_all_recall_details_public.rds")
recall_holdout <- readRDS("data/recall_holdout_with_all_recall_estimates_public.rds")


####################################
#### Cleaning Data #################
####################################

## cleaning codes:
## strip surrounding whitespace and lower-case both hand-coded columns
out$same_subject <- tolower(trimws(out$same_subject))
out$same_claim <- tolower(trimws(out$same_claim))

## "np" is recoded to "no"; the literal string "na" becomes a real NA
out$same_claim <- na_if(dplyr::recode(out$same_claim, `np` = "no"), "na")

## pair-level label: "yes" only when both same_claim and same_subject
## are "yes"; always NA when same_subject is missing
both_yes <- out$same_claim == "yes" & out$same_subject == "yes"
out$final <- ifelse(both_yes, "yes", "no")
out$final[is.na(out$same_subject)] <- NA


## merging in data
## the STM rows were coded without ego/alter IDs; fill the missing IDs
## by looking up the row's ego / alter value in the STM master file
ego_missing <- is.na(out$ego_id)
out$ego_id[ego_missing] <- stm$ego_id[match(out$ego[ego_missing], stm$ego)]

alter_missing <- is.na(out$alter_id)
out$alter_id[alter_missing] <- stm$alter_id[match(out$alter[alter_missing],
                                                  stm$alter)]

## create match id and merge in cutoff data:
## a pair's match id is its ego_id and alter_id, sorted in decreasing
## order and joined with "_", so a pair gets the same id regardless of
## which article is the ego and which is the alter.
## NOTE: apply() is deliberately run over the whole data frame (i.e. on
## its matrix form), exactly as before, so the ids are formatted the
## same way as in the previously saved files that are matched on them.
build_match_id <- function(pairs) {
  id_cols <- which(colnames(pairs) %in% c("ego_id", "alter_id"))
  apply(pairs, 1, function(pair_row) {
    ids <- pair_row[id_cols]
    paste0(ids[order(ids, decreasing = TRUE)], collapse = "_")
  })
}

out$match_id <- build_match_id(out)

## Ngram: merge in ngram cutoff (stratum) for the ngram-coded pairs
ngram$match_id <- build_match_id(ngram)
is_ngram <- out$measure == "ngrams"
out$ngram_cutoff[is_ngram] <- ngram$stratum[
  match(out$match_id[is_ngram], ngram$match_id)
]

## sbert: merge in sbert cutoff (quartile) for the sbert-coded pairs
sbert$match_id <- build_match_id(sbert)
is_sbert <- out$measure == "sbert"
out$sbert_cutoff[is_sbert] <- sbert$cross_score_quartile[
  match(out$match_id[is_sbert], sbert$match_id)
]

## sbert, stricter coding version: merge in its quartile
sbert_recall$match_id <- build_match_id(sbert_recall)
is_sbert_recall <- out$measure == "sbertrecall"
out$sbert_recall_cutoff[is_sbert_recall] <- sbert_recall$cross_score_quartile_recall[
  match(out$match_id[is_sbert_recall], sbert_recall$match_id)
]

## relatio: merge in relatio cutoff (stratum)
relatio$match_id <- build_match_id(relatio)
is_relatio <- out$measure == "relatio"
out$relatio_cutoff[is_relatio] <- relatio$stratum[
  match(out$match_id[is_relatio], relatio$match_id)
]

## ego and alter dates:
## look up each article's final_date through its article_id
ego_article_row <- match(out$ego_id, art$article_id)
alter_article_row <- match(out$alter_id, art$article_id)

out$ego_date <- art$final_date[ego_article_row]
out$alter_date <- art$final_date[alter_article_row]


#########################################
#### Intercoder Measure #################
#########################################

## turn the "yes"/"no" labels into 1/0 votes (NA stays NA)
out$final <- as.numeric(out$final == "yes")
out$same_subject_bin <- as.numeric(out$same_subject == "yes")

## take average of votes
## across coders per coded example
## (one row per pair and measure; n = number of coded rows,
## n_na = number of rows without a final vote)
code <- out %>%
  group_by(match_id, measure) %>%
  summarise(
    ngram_cutoff = unique(ngram_cutoff),
    sbert_cutoff = unique(sbert_cutoff),
    relatio_cutoff = unique(relatio_cutoff),
    sbert_recall_cutoff = unique(sbert_recall_cutoff),
    n = n(),
    n_na = sum(is.na(final)),
    final = mean(final, na.rm = TRUE),
    same_subject = mean(same_subject_bin, na.rm = TRUE)
  )

## which ones have no majority (votes split exactly in half):
tocheck <- which(code$final == .5)
table(code$measure[tocheck]) ## 29 finetune (only 2 coders)
## 1 strict, 9 relatio


## co-lead author Hannah harmonized these:
## to blind myself, I randomize order
## and don't note which sample it was from
## Note: this is how I created this file
## this will not exactly replicate because
## it was based on older version of harmonized
## hand coded file, the exact pairs I needed
## to code didn't change, however

## DON'T RUN, FOR KNOWING WHAT WE DID: 
#set.seed(08540)
#order <- sample(tocheck, length(tocheck), replace = FALSE)
#tocheck <- code$match_id[order]

#tocheck_frame <- out[out$match_id %in% tocheck,
#                     c("match_id",
#                       "ego_summary",
#                       "alter_summary",
#                       "ego_article",
#                       "alter_article")]
#tocheck_frame <- tocheck_frame[!duplicated(tocheck_frame), ]


#write.csv(tocheck_frame,
#          "/scratch/olympus/projects/russia_ukraine_war/bioweapons_new/precision_codes/precision_harmonization_hannah.csv")

## results of this hand coded check:
harmonized_precision <- read.xlsx("data/precision_harmonization_hannah_coded.xlsx")

## harmonized pair-level vote: 1 only when same_claim is "yes";
## a missing same_claim counts as 0
harmonized_precision$final <- ifelse(harmonized_precision$same_claim == "yes" &
                                       !is.na(harmonized_precision$same_claim),
                                     1, 0)

## merge back in harmonized labels
## take one existing row per harmonized pair as a template (first row
## per match_id / summaries / articles combination) and overwrite its
## labels with the harmonized ones; the row is then appended to `out`
## as an additional vote.
## NOTE(review): the de-duplication key does not include `measure`, so a
## pair coded under several measures gets a single harmonized row,
## attached to whichever measure comes first -- confirm this is intended.
new_label <- out[out$match_id %in% harmonized_precision$match_id, ]
new_label <- new_label[!duplicated(new_label[, c("match_id",
                                                 "ego_summary",
                                                 "alter_summary",
                                                 "ego_article",
                                                 "alter_article")]), ]

harmonized_row <- match(new_label$match_id, harmonized_precision$match_id)
new_label$same_subject <- harmonized_precision$same_subject[harmonized_row]
new_label$final <- harmonized_precision$final[harmonized_row]

## the template row still carries the ORIGINAL coder's 0/1 same-subject
## vote; recompute it from the harmonized label so that the averaged
## same_subject share reflects the harmonized vote instead of counting
## the original coder twice (cleaned the same way as the hand codes:
## trimmed and lower-cased; a missing harmonized label gives NA)
new_label$same_subject_bin <- ifelse(
  tolower(trimws(new_label$same_subject)) == "yes", 1, 0
)

out <- bind_rows(out,
                 new_label)


## recalculate the majority vote
## (same aggregation as before, now including the harmonized rows)
code <- out %>%
  group_by(match_id, measure) %>%
  summarise(
    ngram_cutoff = unique(ngram_cutoff),
    sbert_cutoff = unique(sbert_cutoff),
    relatio_cutoff = unique(relatio_cutoff),
    sbert_recall_cutoff = unique(sbert_recall_cutoff),
    n = n(),
    n_na = sum(is.na(final)),
    final = mean(final, na.rm = TRUE),
    same_subject = mean(same_subject_bin, na.rm = TRUE)
  )

## share of "yes" votes -> label: strict majority decides,
## an exact tie (.5) or a missing share gives NA
majority_label <- function(vote_share) {
  dplyr::case_when(vote_share > .5 ~ "yes",
                   vote_share < .5 ~ "no")
}

code$final <- majority_label(code$final)
code$same_subject <- majority_label(code$same_subject)

## create variable
## indicating precision comparison
## group: the measure name followed by whichever cutoff applies
## (missing cutoffs are pasted as "NA" and then removed)
code$measure_combined <- gsub(
  "NA", "",
  paste0(code$measure,
         code$ngram_cutoff,
         code$sbert_cutoff,
         code$sbert_recall_cutoff,
         code$relatio_cutoff)
)

##########################################
#### Fine Tuning Data ####################
##########################################

## We repurpose these hand labelled
## pairs for our fine tuning training data.
## We use these in the code file 06_gpt_fine_tune.R

## this code creates the fine tuning training data
## note: this is based on a slightly older harmonized
## code file and it doesn't include the strict sbert coded
## cases;
## as a result it will not exactly replicate.

## one row per pair (first occurrence of each match_id)
fine_tune <- code[!duplicated(code$match_id), ]

## summaries and article ids come from the first row of `out`
## that has the same match_id
pair_row <- match(fine_tune$match_id, out$match_id)
fine_tune$ego_summary <- out$ego_summary[pair_row]
fine_tune$alter_summary <- out$alter_summary[pair_row]
fine_tune$ego_id <- out$ego_id[pair_row]
fine_tune$alter_id <- out$alter_id[pair_row]

## subject and claim lists come from the article table, by article_id
ego_article_row <- match(fine_tune$ego_id, art$article_id)
alter_article_row <- match(fine_tune$alter_id, art$article_id)

fine_tune$ego_subject <- art$subject_list[ego_article_row]
fine_tune$alter_subject <- art$subject_list[alter_article_row]

fine_tune$ego_claim <- art$claim_list[ego_article_row]
fine_tune$alter_claim <- art$claim_list[alter_article_row]


#saveRDS(fine_tune, "data/fine_tune_examples.rds")



###########################################
### Calculating Precision  ################
###########################################

## for the gpt4o fine tuning precision cases
## we need to merge in data on whether
## the articles/pairs were used in fine tuning training.
## For gpt4o fine tune only, any precision examples
## that were used in fine tuning are excluded from our precision
## test. We did this to prevent data leakage.

## limiting training data to training examples
## (in fine-tuning, 06_gpt_fine_tune.R,
## we kept some pairs held out for
## test validation; those pairs were not seen in
## fine tuning so we don't need to exclude them)
fine_tune_same_claim_data <- filter(fine_tune_same_claim_data,
                                    train_test == "train")
fine_tune_same_subject_data <- filter(fine_tune_same_subject_data,
                                      train_test == "train")

## rows that evaluate the fine tuned gpt4o annotator; both leakage
## variables below are NA for every other measure
is_finetune_eval <- code$measure_combined == "gpt4o_finetune"

## create variable:
## were pairs we coded for evaluating gpt4o
## fine tuning
## mistakenly used in training?
training_pair_ids <- c(fine_tune_same_claim_data$match_id,
                       fine_tune_same_subject_data$match_id)
code$fine_tuning_gpt4o_training <- ifelse(
  !is_finetune_eval,
  NA,
  ifelse(code$match_id %in% training_pair_ids,
         "training data",
         "not training data")
)
table(code$fine_tuning_gpt4o_training) ## 2 cases

## create second variable:
## were any of the individual articles used
## in training?
## This is a more stringent measure of leakage,
## i.e. whether, in a given evaluation pair,
## a single article was used in one of the training pairs
## as ego or alter:

## the two article ids are the first and second "_"-separated
## pieces of the match id
match_id_parts <- str_split(code$match_id, "_")
code$ego_id <- unlist(lapply(match_id_parts, function(parts) parts[1]))
code$alter_id <- unlist(lapply(match_id_parts, function(parts) parts[2]))

training_article_ids <- c(fine_tune_same_claim_data$ego_id,
                          fine_tune_same_claim_data$alter_id,
                          fine_tune_same_subject_data$ego_id,
                          fine_tune_same_subject_data$alter_id)
touches_training_article <- code$ego_id %in% training_article_ids |
  code$alter_id %in% training_article_ids

code$fine_tuning_gpt4o_training_stringent <- ifelse(
  !is_finetune_eval,
  NA,
  ifelse(touches_training_article,
         "training data",
         "not training data")
)

table(code$fine_tuning_gpt4o_training_stringent) ## 15 cases

## removing actual training examples
## - these should have been previously excluded
code$final[is_finetune_eval &
             code$fine_tuning_gpt4o_training == "training data"] <- NA

## creating a stringent measure - will only be different for fine tuning
## evaluation cases
code$final_stringent <- code$final
code$final_stringent[is_finetune_eval &
                       code$fine_tuning_gpt4o_training_stringent == "training data"] <- NA


## Calculating precision (note that for GPT4o fine tune
## we decided to use the stringent measure, excluding the 15 cases)
## n = pairs with a non-missing label, true_positive = pairs labelled
## "yes", Precision = their ratio
precision <- code %>%
  group_by(measure_combined) %>%
  summarise(
    n = sum(!is.na(final_stringent)),
    true_positive = sum(final_stringent %in% "yes"),
    Precision = true_positive / n,
    measure = unique(measure)
  )


## for ngram, relatio, and sbert we need to
## weight precision estimates by the overall proportion of pairs
## in each bucket.
## We exclude the less strict version of sbert
## because that was from an older version
## of cutoffs we didn't consider because too low (i.e. almost no
## true positive cases)
cutoff_measures <- c("ngrams",
                     "relatio",
                     "sbertrecall")
precision_cutoffs <- precision %>%
  filter(measure %in% cutoff_measures)

## population data (number of actual pairs
## in each bin and overall number of pairs):
## when we labeled precision cases we did stratified random sampling
## for selecting pairs to label,
## so for example for the .2 ngram estimator
## we sampled pairs between .2 and .4, .4 and .6, and .6+.
## We want to calculate the overall precision of .2 as a cutoff,
## so we need to weight the precision estimate by the
## overall prevalence of each bin.
## There were 2,114 pairs with at least .2 cosine sim for the ngram estimator:
## 1,368 between .2 and .4, 556 between .4 and .6, 190 .6+;
## we take our precision estimates for each bin and weight by the prevalence
## of each bin within the 2,114 pairs.
## This information was saved in the labelled data.

## one row per stratum with its population count, the total over all
## strata of that measure, and the measure name
sbert_population <- sbert_recall %>%
  group_by(cross_score_quartile_recall) %>%
  summarise(count = unique(cross_score_quartile_recall_denominator_counts)) %>%
  rename(stratum = cross_score_quartile_recall) %>%
  mutate(total = sum(count),
         measure = "sbertrecall")

relatio_population <- relatio %>%
  group_by(stratum) %>%
  summarise(count = unique(stratum_denominator)) %>%
  mutate(total = sum(count),
         measure = "relatio")

ngram_population <- ngram %>%
  group_by(stratum) %>%
  summarise(count = unique(stratum_denominator_counts)) %>%
  mutate(total = sum(count),
         measure = "ngrams")

population <- bind_rows(relatio_population,
                        ngram_population,
                        sbert_population)

## the stratum label is what is left of measure_combined once the
## measure name is removed
precision_cutoffs$stratum <- gsub("ngrams|relatio|sbertrecall",
                                  "", precision_cutoffs$measure_combined)

## joining information on population percentages
precision_cutoffs <- left_join(precision_cutoffs,
                               population,
                               by = c("stratum", "measure"))

## this code does the reweighting
## For every measure, and for every row j of that measure, rows j..last
## are treated as the bins at or above cutoff j: the reweighted precision
## of cutoff j is the average of those bins' precision estimates,
## weighted by each bin's share of the population count.
## This assumes the rows of each measure are ordered from the lowest to
## the highest stratum (they arrive sorted by measure_combined).

## a stratum without a population count (failed join) would silently
## turn the reweighted precision into NA, so flag it explicitly
if (anyNA(precision_cutoffs$count)) {
  warning("Some precision strata have no matching population count; ",
          "their reweighted precision will be NA.",
          call. = FALSE)
}

precision_cutoffs$Precision_reweighted <- NA_real_
precision_cutoffs$`Total Predicted Pairs` <- NA_real_
precision_cutoffs <- split(precision_cutoffs,
                           precision_cutoffs$measure)

## seq_along()/seq_len() instead of 1:length()/1:len so that an empty
## list or an empty measure is skipped instead of indexing c(1, 0)
for (i in seq_along(precision_cutoffs)) {
  len <- nrow(precision_cutoffs[[i]])

  ## for each cutoff (e.g. .2+, .4+, .6+)
  for (j in seq_len(len)) {
    ## total number in each bin
    proportions <- precision_cutoffs[[i]]$count[j:len]
    ## total across all bins
    total <- sum(proportions)
    ## proportion in each bin
    proportions <- proportions / total
    ## take weighted average of precision estimates
    ## by 1) multiplying each precision estimate
    ## by the population proportion for that bin
    re_weight <- precision_cutoffs[[i]]$Precision[j:len] * proportions
    ## 2) summing these weighted proportions across the bins
    precision_cutoffs[[i]]$Precision_reweighted[j] <- sum(re_weight)
    precision_cutoffs[[i]]$`Total Predicted Pairs`[j] <- total
  }
}
precision_cutoffs <- bind_rows(precision_cutoffs)

## limit to columns; the reweighted estimate replaces the per-bin
## estimate under the name "Precision"
precision_cutoffs <- precision_cutoffs %>%
  dplyr::select(measure_combined,
                n,
                true_positive,
                Precision = Precision_reweighted,
                measure,
                `Total Predicted Pairs`)


## pulling together weighted and non-weighted
## precision estimates.
## Here we identify the precision estimates
## that didn't need to be reweighted.
## We also exclude "gpt4o_strict", as
## this was a zero shot
## annotation where we used a stricter annotation prompt
## and we don't report those in the paper
## (found little difference from the zero shot prompt we
## used)
reweighted_or_unreported <- c("ngrams",
                              "relatio",
                              "sbert",
                              "sbertrecall",
                              "gpt4o_strict")
precision_no_cutoffs <- precision %>%
  filter(!(measure %in% reweighted_or_unreported))

precision <- bind_rows(precision_cutoffs,
                       precision_no_cutoffs)

## manually input total predicted pairs for
## GPT models and STM
## calculated in estimate_recall file
## line 230
manual_totals <- c(gpt4o_finetune = 4204,
                   gpt4o_original = 18138,
                   stm = 34210)
for (measure_name in names(manual_totals)) {
  precision$`Total Predicted Pairs`[precision$measure == measure_name] <-
    manual_totals[[measure_name]]
}

###################################
### Merging in Recall Data ########
###################################

### merging in recall data

## calculating recall estimates for
## sbert cutoffs: indicator columns for whether a recall pair is
## recovered at each quartile cutoff
table(recall$out_cross_score_quartile_recall)

## sbert_1st: pair has any quartile; sbert_2nd: any quartile other than
## the 1st; sbert_3rd: any quartile other than the 1st and 2nd;
## sbert_4th: exactly the 4th quartile. Pairs without a quartile are
## FALSE throughout.
flag_sbert_quartiles <- function(recall_pairs) {
  quartile <- recall_pairs$out_cross_score_quartile_recall
  has_quartile <- !is.na(quartile)

  recall_pairs$sbert_1st <- has_quartile
  recall_pairs$sbert_2nd <- has_quartile &
    quartile != "1st Quartile Recall"
  recall_pairs$sbert_3rd <- recall_pairs$sbert_2nd &
    quartile != "2nd Quartile Recall"
  recall_pairs$sbert_4th <- has_quartile &
    quartile == "4th Quartile Recall"
  recall_pairs
}

recall <- flag_sbert_quartiles(recall)
recall_holdout <- flag_sbert_quartiles(recall_holdout)

## creating long version
## of recall data:
## one row per estimator with n = number of pairs that have a
## non-missing indicator and recall = share of those pairs recovered
recall_indicator_cols <- c("llm_zero_shot",
                           "llm_fine_tune",
                           "llm_fine_tune_stringent",
                           "ngram_2",
                           "ngram_4",
                           "ngram_6",
                           "STM_cluster",
                           "relatio_1",
                           "relatio_2",
                           "relatio_4",
                           "relatio_6",
                           "sbert_1st",
                           "sbert_2nd",
                           "sbert_3rd",
                           "sbert_4th")

summarise_recall <- function(recall_pairs) {
  recall_pairs %>%
    dplyr::select(all_of(recall_indicator_cols)) %>%
    gather(key = "measure",
           value = "recall") %>%
    group_by(measure) %>%
    summarize(n = sum(!is.na(recall)),
              recall = sum(recall, na.rm = TRUE) / n)
}

recall <- summarise_recall(recall)

## same summary for the holdout set, with the estimate column named
## recall_holdout
recall_holdout <- summarise_recall(recall_holdout) %>%
  rename(recall_holdout = recall)



## merge in recall data to precision
## estimates:
## translate the recall column names into the measure_combined labels
## used in the precision table (names not listed here are left as is)
measure_labels <- c(
  ngram_2 = "ngrams.2 < .4",
  ngram_4 = "ngrams.4 < .6",
  ngram_6 = "ngrams.6+",
  relatio_1 = "relatio.1 < .2",
  relatio_2 = "relatio.2 < .4",
  relatio_4 = "relatio.4 < .6",
  relatio_6 = "relatio.6+",
  sbert_1st = "sbertrecall1st Quartile Recall",
  sbert_2nd = "sbertrecall2nd Quartile Recall",
  sbert_3rd = "sbertrecall3rd Quartile Recall",
  sbert_4th = "sbertrecall4th Quartile Recall",
  llm_fine_tune = "gpt4o_finetune",
  llm_zero_shot = "gpt4o_original",
  STM_cluster = "stm"
)

recall$measure <- dplyr::recode(recall$measure, !!!measure_labels)
recall_holdout$measure <- dplyr::recode(recall_holdout$measure,
                                        !!!measure_labels)

## look up both recall estimates by measure_combined
recall_row <- match(precision$measure_combined, recall$measure)
precision$recall <- recall$recall[recall_row]

holdout_row <- match(precision$measure_combined, recall_holdout$measure)
precision$recall_holdout <- recall_holdout$recall_holdout[holdout_row]

## Note: for recall estimates with the fine tuned annotator,
## we report the stringent version of the recall estimates
## separately from the main table
## (i.e. the version where we excluded any recall pair
## if one of the articles in the pair was included in fine tuning).
## This is because the number of pairs declines significantly
## (we created the recall set before we did the fine tuning).
## In no cases were pairs actually included in the fine tuning.

## the lines below only print rows / numbers for inspection;
## they do not modify any object
recall[recall$measure == "llm_fine_tune_stringent",] ## 52.3%, 44 pairs considered
recall_holdout[recall_holdout$measure == "llm_fine_tune_stringent",] ## 53.8% 26 pairs considered

## compared with the non-stringent fine tune recall
recall[recall$measure == "gpt4o_finetune",] ## 43%, 121 pairs considered
recall_holdout[recall_holdout$measure == "gpt4o_finetune",] ## 48.9%, 47 pairs considered

## in the paper we report, for a few measures,
## the denominators for these estimates
recall[recall$measure == "stm",] ## 50/121 - 41.3
recall_holdout[recall_holdout$measure == "stm",]  # 27.7%

recall[recall$measure == "relatio.1 < .2",]
121 * .413 ## 50 pairs
recall_holdout[recall_holdout$measure == "relatio.1 < .2",]
16 / 47 ## 16 pairs


## Calculating F1
## F1 = 2 * (precision * recall) / (precision + recall)
f1_score <- function(precision_est, recall_est) {
  2 * (precision_est * recall_est) / (precision_est + recall_est)
}

## We calculate (for main results) F1 scores based on holdout recall
## and precision scores
precision$F1 <- f1_score(precision$Precision, precision$recall_holdout)

## for estimators with multiple cutoffs, we choose the optimal estimator
## based on the F1 of non-heldout recall and precision
precision$F1_alt <- f1_score(precision$Precision, precision$recall)

## print, per measure, the best F1_alt and the cutoff that attains it
precision %>%
  group_by(measure) %>%
  summarise(max = max(F1_alt),
            measure_combined = measure_combined[F1_alt == max])
## text reuse: .2 cosine similarity
## relatio: .1
## sbert: 3rd quartile


## formatting table:
## keep the reported columns and express all scores as percentages
## (multiplying by 100); the pair counts are left untouched
precision <- precision %>%
  dplyr::select(measure_combined,
                F1,
                F1_alt,
                Precision,
                recall,
                recall_holdout,
                `Total Predicted Pairs`) %>%
  mutate(across(c(F1, F1_alt, Precision, recall, recall_holdout),
                ~ .x * 100))

## table displayed in Table 1 in main text
## and in SI, Table A4
xtable(precision)





